Library loading and set up

library(tidyverse)
library(DESeq2)
library(knitr)
library(kableExtra)
library(plyranges)
library(stringr)
library(biobroom)
library(ggrepel)
library(RColorBrewer)
library(gplots)
library(biomaRt)
library(ChIPseeker)
library(TxDb.Mmusculus.UCSC.mm10.knownGene)
library(mixOmics)
library(mosaic)
library(clusterProfiler)
library(DOSE)
library(org.Mm.eg.db)
library(pathview)
library(AnnotationDbi)
library(fgsea)
library(rvest)
library(ggseqlogo)
library(gridExtra)
library(rtracklayer)
library(Biostrings)
library(Rsamtools)
library(data.table)
library(grid)
library(enrichplot)

Load dataset

This dataset is generated using WT and KRas-G12D expressingn colonic tumor from mouse. Tumors were induced via 4-OHT enema. Mice used are - Villin-CreER/+; Apc(fl/fl) - Villin-CreER/+; Apc(fl/fl); KRasG12D/+

Alignment and peak calling was done using the bcbio ATAC-Seq pipeline following instructions described here: (https://bcbio-nextgen.readthedocs.io/en/latest/contents/atac.html)

Merged peak data are used here. This is obtained by combining all peaks from all samples, merging peaks within 500bp of each other, and using featureCount to count all reads in the peaks.

dir.create("PDF_Figure", showWarnings = FALSE)

metadata = readr::read_csv("Data/bcbio_output/metadata.csv") %>%
  dplyr::rename("sample"=...1) %>%
  dplyr::select(-batch, -phenotype) %>%
  dplyr::filter(tissue == "colon_tumor") %>%
  mutate(condition=relevel(factor(condition), ref="WT")) %>%
  as.data.frame()
rownames(metadata) = metadata$sample
counts = readr::read_tsv("Data/bcbio_output/merged/merged-counts.tsv") %>%
  tibble::column_to_rownames("id")

Metadata

metadata %>%
  kable() %>%
  kable_styling()
sample condition replicate tissue
WT_tumor_rep1 WT_tumor_rep1 WT rep1 colon_tumor
WT_tumor_rep2 WT_tumor_rep2 WT rep2 colon_tumor
WT_tumor_rep3 WT_tumor_rep3 WT rep3 colon_tumor
Kras_tumor_rep1 Kras_tumor_rep1 Kras rep1 colon_tumor
Kras_tumor_rep2 Kras_tumor_rep2 Kras rep2 colon_tumor
Kras_tumor_rep3 Kras_tumor_rep3 Kras rep3 colon_tumor

Load Ensembl ID dataset

ensembl = useMart("ensembl", dataset="mmusculus_gene_ensembl")
symbols = getBM(attributes=c("entrezgene_id", "mgi_symbol", "ensembl_gene_id"), mart=ensembl) %>%
  dplyr::rename("geneId"=entrezgene_id) %>%
  mutate(geneId=as.character(geneId))

save(symbols, file = "Data/symbols.rda")

Cleanup

We need to do a little it of cleanup before contining. We need to remove peaks that appear in regions known to be false positive machines, called the blacklist regions. We also are interested in regions that could be affecting expression via chromatin accessability, so we’ll only consider peaks within a window around the transcription start sites of genes.

Blacklist region removal

load("Data/symbols.rda")

expand_region_string = function(df, column) {
  df = as.data.frame(df)
  tokens = str_split_fixed(df[,column], ":", 2)
  chrom = tokens[,1]
  start = as.numeric(str_split_fixed(tokens[,2], "-", 2)[, 1])
  end = as.numeric(str_split_fixed(tokens[,2], "-", 2)[, 2])
  regions = data.frame(seqnames=chrom, start=start, end=end)
  df %>%
    bind_cols(regions)
}
blacklist = readr::read_tsv("Data/mm10-blacklist.v2.bed.gz", col_names=c("seqnames", "start", "end")) %>%
  as_granges()
blacklist_peaks = counts %>%
  tibble::rownames_to_column("peak") %>%
  expand_region_string("peak") %>%
  as_granges() %>%
  join_overlap_inner(blacklist) %>%
  tidy() %>%
  pull(peak)
counts = counts[!rownames(counts) %in% blacklist_peaks, ]

A few peaks overlap the ENCODE blacklist regions, so we removed them from the analysis. There were 8440 that overlapped with the blacklist regions.

Open region annotation

Here we annotate the peaks that we called with contextual genomic information. We can also see that some peaks are annotated in multiple regions, this can’t be helped as genes overlap so there are areas where multiple features of genes overlap.

annotatedobj = ChIPseeker::annotatePeak(counts %>%
                                        tibble::rownames_to_column("peak") %>%
                                        expand_region_string("peak") %>%
                                        as_granges(), TxDb = TxDb.Mmusculus.UCSC.mm10.knownGene)
## >> preparing features information...      2021-10-07 16:46:35 
## >> identifying nearest features...        2021-10-07 16:46:37 
## >> calculating distance from peak to TSS...   2021-10-07 16:46:49 
## >> assigning genomic annotation...        2021-10-07 16:46:49 
## >> assigning chromosome lengths           2021-10-07 16:47:28 
## >> done...                    2021-10-07 16:47:28
plotAnnoPie(annotatedobj)

pdf('PDF_Figure/Annotation_Pie.pdf',
    width = 6,
    height = 4)
plotAnnoPie(annotatedobj)
dev.off()
## quartz_off_screen 
##                 2
upsetplot(annotatedobj)
## Warning: Removed 1660 rows containing non-finite values (stat_count).

pdf('PDF_Figure/UpsetPlot.pdf',
    width = 6,
    height = 4)
upsetplot(annotatedobj)
## Warning: Removed 1660 rows containing non-finite values (stat_count).
dev.off()
## quartz_off_screen 
##                 2

Remove peaks not near TSS

Generally we only care about peaks that are close to gene, here we are pretty lenient and consider peaks that are within 1000 bases of a transcription start site of a gene.

annotated = annotatedobj %>%
  as.GRanges() %>%
  tidy() 
ggplot(annotated, aes(distanceToTSS)) +
  stat_density(geom="line") +
  xlim(c(-10000,10000)) 
## Warning: Removed 317541 rows containing non-finite values (stat_density).

KEEP_DISTANCE = 1000
close_to_tss = annotated %>%
  dplyr::filter(abs(distanceToTSS) < KEEP_DISTANCE) %>%
  pull(peak)
counts = counts[rownames(counts) %in% close_to_tss,]

Most peaks fall close to the TSS, but there are some that are very far away from the TSS. We’ll remove peaks that aren’t anywhere near the TSS for a gene from the analysis. We’ll keep only peaks within 1000 bases of a TSS. This leaves us with 36844 peaks to consider.

Post filtering peak annotation

Here we annotate the peaks post-filtering

annotatedobj = ChIPseeker::annotatePeak(counts %>%
                                        tibble::rownames_to_column("peak") %>%
                                        expand_region_string("peak") %>%
                                        as_granges(), TxDb = TxDb.Mmusculus.UCSC.mm10.knownGene)
## >> preparing features information...      2021-10-07 16:53:45 
## >> identifying nearest features...        2021-10-07 16:53:45 
## >> calculating distance from peak to TSS...   2021-10-07 16:53:47 
## >> assigning genomic annotation...        2021-10-07 16:53:47 
## >> assigning chromosome lengths           2021-10-07 16:53:53 
## >> done...                    2021-10-07 16:53:53
plotAnnoPie(annotatedobj)

pdf('PDF_Figure/Annotation_Pie_filtered.pdf',
    width = 6,
    height = 4)
plotAnnoPie(annotatedobj)
dev.off()
## quartz_off_screen 
##                 2
upsetplot(annotatedobj)

pdf('PDF_Figure/UpsetPlot_filtered.pdf',
    width = 6,
    height = 4)
upsetplot(annotatedobj)
dev.off()
## quartz_off_screen 
##                 2

Differential affinity analysis

Here we look at the condition_Kras_vs_WT coefficient, which will show us the peak affinity differences between the mouse KRasG12D and WT pancreatic epithelial cell samples.

dds = DESeqDataSetFromMatrix(counts, metadata, design=~condition)
## converting counts to integer mode
dds = DESeq(dds)
## estimating size factors
## estimating dispersions
## gene-wise dispersion estimates
## mean-dispersion relationship
## final dispersion estimates
## fitting model and testing
# Print scale factors for BW normalization
1/sizeFactors(dds)
##   WT_tumor_rep1   WT_tumor_rep2   WT_tumor_rep3 Kras_tumor_rep1 Kras_tumor_rep2 
##       0.9023553       1.0468537       1.0241625       1.2142010       0.9657078 
## Kras_tumor_rep3 
##       0.8672074
plotDispEsts(dds)

CUTOFF = 0.05
dds_result <- lfcShrink(dds, coef = "condition_Kras_vs_WT", type = "normal")
## using 'normal' for LFC shrinkage, the Normal prior from Love et al (2014).
## 
## Note that type='apeglm' and type='ashr' have shown to have less bias than type='normal'.
## See ?lfcShrink for more details on shrinkage type, and the DESeq2 vignette.
## Reference: https://doi.org/10.1093/bioinformatics/bty895

QC

DESeq2::plotMA(dds_result)

dds_transform <- varianceStabilizingTransformation(dds)
rawCountTable_transform <- as.data.frame(assay(dds_transform))
pseudoCount_transform = log2(rawCountTable_transform + 1)
mat.dist = pseudoCount_transform
mat.dist = as.matrix(dist(t(mat.dist)))
mat.dist = mat.dist/max(mat.dist)
png('Figure/Hierchical_Clustering_merged.png')
cim(mat.dist, symkey = FALSE, margins = c(7,7))
suppressMessages(dev.off())
## quartz_off_screen 
##                 2
include_graphics('Figure/Hierchical_Clustering_merged.png')

pdf('PDF_Figure/Hierchical_Clustering.pdf',
    width = 6,
    height = 6)
cim(mat.dist, symkey = FALSE, margins = c(7,7))
dev.off()
## quartz_off_screen 
##                 2

The top 500 most variable genes are selected for PCA analysis.

plotPCA(dds_transform, intgroup = "condition", ntop = 500)

pdf('PDF_Figure/PCA.pdf',
    width = 6,
    height = 4)
plotPCA(dds_transform, intgroup = "condition", ntop = 500)
dev.off()
## quartz_off_screen 
##                 2

Output results

Signal to noise ratio is calculated using the definition given by Broad GSEA.

markup_deseq2 = function(res) {
  resanno = res %>%
    expand_region_string("peak") %>%
    as_granges()
  ChIPseeker::annotatePeak(resanno, TxDb=TxDb.Mmusculus.UCSC.mm10.knownGene) %>%
    as.GRanges() %>%
    tidy() %>%
    left_join(symbols, by="geneId")
}

res = dds_result %>%
  as_tibble(rownames = "peak")

# calculate signal-to-noise ratio for GSEA later
s2n <- function(num_list, cond_1 = c(4:6), cond_2 = c(1:3)) {
  mean1 <- mean(num_list[cond_1])
  if (mean1 == 0) {
    mean1 = 1
  }
  mean2 <- mean(num_list[cond_2])
  if (mean2 == 0) {
    mean2 = 1
  }
  sd1 <- sd(num_list[cond_1])
  sd2 <- sd(num_list[cond_2])
  sd1 <- min(sd1, 0.2*abs(mean1))
  sd2 <- min(sd2, 0.2*abs(mean2))
  s2nvalue <- (mean1-mean2)/(sd1+sd2)
  return(s2nvalue)
}

rawCountTable <- as.data.frame(DESeq2::counts(dds, normalize = TRUE)) 

res$s2n <- apply(rawCountTable,1,s2n)

res <- markup_deseq2(res)
## >> preparing features information...      2021-10-07 16:55:11 
## >> identifying nearest features...        2021-10-07 16:55:11 
## >> calculating distance from peak to TSS...   2021-10-07 16:55:13 
## >> assigning genomic annotation...        2021-10-07 16:55:13 
## >> assigning chromosome lengths           2021-10-07 16:55:20 
## >> done...                    2021-10-07 16:55:20
res <- res[!duplicated(res$peak),]

write.csv(res, "Result/ATAC_crc_Kras-WT_merged.csv", row.names = FALSE)

rawCountTable <- rawCountTable %>% rownames_to_column(var = "peak")
rawCountTable <- left_join(rawCountTable, res[,c(6,24)], by = c("peak" ="peak")) 

write.csv(rawCountTable, "Result/ATAC_crc_normalized_count.csv", row.names = FALSE, na = "")

Description of output files

  • start: start of peak
  • end: end of peak
  • width: width of peak
  • strand: strand of peak
  • seqname: chromosome of peak
  • peak: ID of peak
  • baseMean: mean count of reads at peak for all samples
  • estimate: log2 fold change of comparison
  • stderror: standard error of the log2 fold change
  • statistic: value of test statistic (Wald test)
  • p.value: unadjusted p-value
  • p.adjusted: adjusted p-value by BH correction
  • annotation: proximity to nearest gene feature
  • geneChr: chromosome of gene
  • geneStart: start coordinate of gene
  • geneEnd: end coordinate of gene
  • geneId: Entrez gene ID of gene
  • transcriptId: transcript ID
  • distanceToTSS: distance to nearest transcription start site
  • mgi_symbol: gene symbol from MGI

Visualization

Heatmap

res_sig = res %>%
  dplyr::filter(padj < CUTOFF) %>%
  #arrange(pvalue) %>%
  #head(200) %>%
  pull(peak)
ncounts = DESeq2::counts(dds, normalized=TRUE)
pseudoCount = log2(ncounts + 1)

res_sig_counts = ncounts[rownames(ncounts) %in% res_sig,] %>% apply(1,zscore) %>% t()

my_palette <- colorRampPalette(c("blue", "white", "red"))(256)

png('Figure/KRas vs WT crc ATAC-Seq_merged.png',
    width = 600,
    height = 1200,
    res = 150,
    pointsize = 8)
par(cex.main=1.1)
heatmap.2(res_sig_counts,
          main = "Peaks with differential affinity\n in KRasG12D colonic tumor\npadj < 0.05",
          density.info = "none",
          key = TRUE,
          lwid = c(3,7),
          lhei = c(1,7),
          col=my_palette,
          margins = c(15,2),
          symbreaks = TRUE,
          trace = "none",
          dendrogram = "row",
          labRow = FALSE,
          ylab = "Genes",
          cexCol = 2,
          Colv = "NA")
dev.off()
## quartz_off_screen 
##                 2
include_graphics('Figure/KRas vs WT crc ATAC-Seq_merged.png')

pdf('PDF_Figure/Heatmap.pdf',
    width = 6,
    height = 12)
par(cex.main=1.1)
heatmap.2(res_sig_counts,
          main = "Peaks with differential affinity\n in KRasG12D colonic tumor\npadj < 0.05",
          density.info = "none",
          key = TRUE,
          lwid = c(3,7),
          lhei = c(1,7),
          col=my_palette,
          margins = c(15,2),
          symbreaks = TRUE,
          trace = "none",
          dendrogram = "row",
          labRow = FALSE,
          ylab = "Genes",
          cexCol = 2,
          Colv = "NA")
dev.off()
## quartz_off_screen 
##                 2

Scatter Plot

# Scatter plot
res$kras_mean <- rowMeans(pseudoCount[,4:6])
res$wt_mean <- rowMeans(pseudoCount[,1:3])
ggplot(res, aes(x = wt_mean, y = kras_mean)) +
  xlab("WT_Average(log2)") + ylab("KRas_Average(log2)") + 
  geom_point(data = res, alpha = 0.5, size = 1, color = "grey") +
  geom_point(data = subset(res, padj < 0.05 & log2FoldChange > 0), alpha = 0.5, size = 1, color = "red") +
  geom_point(data = subset(res, padj < 0.05 & log2FoldChange < 0), alpha = 0.5, size = 1, color = "blue") +
  labs(title = "KRasG12D vs WT ATAC-Seq Scatter Plot")

pdf('PDF_Figure/Scatter_Plot.pdf',
    width = 5,
    height = 4)
ggplot(res, aes(x = wt_mean, y = kras_mean)) +
  xlab("WT_Average(log2)") + ylab("KRas_Average(log2)") + 
  geom_point(data = res, alpha = 0.5, size = 1, color = "grey") +
  geom_point(data = subset(res, padj < 0.05 & log2FoldChange > 0), alpha = 0.5, size = 1, color = "red") +
  geom_point(data = subset(res, padj < 0.05 & log2FoldChange < 0), alpha = 0.5, size = 1, color = "blue") +
  labs(title = "KRasG12D vs WT ATAC-Seq Scatter Plot")
dev.off()
## quartz_off_screen 
##                 2

MA Plot

# MA plot
ggplot(res, aes(x = log(baseMean,2), y = log2FoldChange,)) +
  xlab("Average Expression") + ylab("LFC") + 
  geom_point(data = res, alpha = 0.5, size = 1, color = "grey") +
  geom_point(data = subset(res, padj < 0.05 & log2FoldChange > 0), alpha = 0.5, size = 1, color = "red") +
  geom_point(data = subset(res, padj < 0.05 & log2FoldChange < 0), alpha = 0.5, size = 1, color = "blue") +
labs(title = "KRas vs WT pancreas ATAC-Seq MA Plot")

pdf('PDF_Figure/MA_Plot.pdf',
    width = 5,
    height = 4)
ggplot(res, aes(x = log(baseMean,2), y = log2FoldChange,)) +
  xlab("Average Expression") + ylab("LFC") + 
  geom_point(data = res, alpha = 0.5, size = 1, color = "grey") +
  geom_point(data = subset(res, padj < 0.05 & log2FoldChange > 0), alpha = 0.5, size = 1, color = "red") +
  geom_point(data = subset(res, padj < 0.05 & log2FoldChange < 0), alpha = 0.5, size = 1, color = "blue") +
labs(title = "KRas vs WT pancreas ATAC-Seq MA Plot")
dev.off()
## quartz_off_screen 
##                 2

Volcano Plot

# Volcano Plot
ggplot(res, aes(x = log2FoldChange, y = -log(padj,10))) +
  xlab("LFC") + ylab("-log10(P value)") + 
  geom_point(data = res, alpha = 0.5, size = 1, color = "black") +
  geom_point(data = subset(res, padj < 0.05 & log2FoldChange > 0), alpha = 0.5, size = 1, color = "red") +
  geom_point(data = subset(res, padj < 0.05 & log2FoldChange < 0), alpha = 0.5, size = 1, color = "blue") +
labs(title = "KRas vs WT pancreas ATAC-Seq Volcano Plot")
## Warning: Removed 17858 rows containing missing values (geom_point).

pdf('PDF_Figure/Volcano_Plot.pdf',
    width = 5,
    height = 4)
ggplot(res, aes(x = log2FoldChange, y = -log(padj,10))) +
  xlab("LFC") + ylab("-log10(P value)") + 
  geom_point(data = res, alpha = 0.5, size = 1, color = "black") +
  geom_point(data = subset(res, padj < 0.05 & log2FoldChange > 0), alpha = 0.5, size = 1, color = "red") +
  geom_point(data = subset(res, padj < 0.05 & log2FoldChange < 0), alpha = 0.5, size = 1, color = "blue") +
labs(title = "KRas vs WT pancreas ATAC-Seq Volcano Plot")
## Warning: Removed 17858 rows containing missing values (geom_point).
dev.off()
## quartz_off_screen 
##                 2

Pathway Enrichment

GO

Classic GO analysis is performed here for all DE peak-genes detected in this dataset. The reference list is list of peak-genes detected in ATAC-Seq. Three categories of GO terms are tested here, including biological process, molecular function and cellular component.

dir.create("Result/GO", showWarnings = FALSE)

target_gene <- res %>% dplyr::filter(padj < 0.05) %>% dplyr::select(ensembl_gene_id) %>% unlist() %>% as.character()
detected_gene <- res %>% dplyr::select(ensembl_gene_id) %>% unlist() %>% as.character()

# Run GO enrichment analysis for biological process
ego_BP <- enrichGO(gene = target_gene, 
                universe = detected_gene,
                keyType = "ENSEMBL",
                OrgDb = org.Mm.eg.db, 
                ont = "BP", 
                pAdjustMethod = "BH", 
                pvalueCutoff = 0.05, 
                readable = TRUE)

# Output results from GO analysis to a table
cluster_summary_BP <- data.frame(ego_BP)

write.csv(cluster_summary_BP, "Result/GO/GO analysis_BP_merged.csv")

# Run GO enrichment analysis for molecular function 
ego_MF <- enrichGO(gene = target_gene, 
                universe = detected_gene,
                keyType = "ENSEMBL",
                OrgDb = org.Mm.eg.db, 
                ont = "MF", 
                pAdjustMethod = "BH", 
                pvalueCutoff = 0.05, 
                readable = TRUE)

# Output results from GO analysis to a table
cluster_summary_MF <- data.frame(ego_MF)

write.csv(cluster_summary_MF, "Result/GO/GO analysis_MF_merged.csv")

# Run GO enrichment analysis for cellular component 
ego_CC <- enrichGO(gene = target_gene, 
                universe = detected_gene,
                keyType = "ENSEMBL",
                OrgDb = org.Mm.eg.db, 
                ont = "CC", 
                pAdjustMethod = "BH", 
                pvalueCutoff = 0.05, 
                readable = TRUE)

# Output results from GO analysis to a table
cluster_summary_CC <- data.frame(ego_CC)

write.csv(cluster_summary_CC, "Result/GO/GO analysis_CC_merged.csv")

Biological Process

pdf("PDF_Figure/GO_BP_Dotplot.pdf",
    width = 10,
    height = 10)
dotplot(ego_BP, showCategory=20)
dev.off()
## quartz_off_screen 
##                 2
include_graphics('PDF_Figure/GO_BP_Dotplot.pdf')

ego_BP <- pairwise_termsim(ego_BP)
pdf("PDF_Figure/GO_BP_eMAPplot.pdf",
    width = 10,
    height = 10)
emapplot(ego_BP, showCategory = 50)
dev.off()
## quartz_off_screen 
##                 2
include_graphics('PDF_Figure/GO_BP_eMAPplot.pdf')

OE_foldchanges <- res %>% dplyr::filter(padj < 0.05) %>% dplyr::select(c(log2FoldChange, mgi_symbol))
OE <- OE_foldchanges$log2FoldChange %>% unlist()
names(OE) <- OE_foldchanges$mgi_symbol

pdf("PDF_Figure/GO_BP_cNetplot.pdf",
    width = 10,
    height = 10)
cnetplot(ego_BP, 
         categorySize="pvalue", 
         showCategory = 5, 
         foldChange=OE, 
         vertex.label.font=6)
dev.off()
## quartz_off_screen 
##                 2
include_graphics('PDF_Figure/GO_BP_cNetplot.pdf')

Molecular Function

pdf("PDF_Figure/GO_MF_Dotplot.pdf",
    width = 10,
    height = 5)
dotplot(ego_MF, showCategory=10)
dev.off()
## quartz_off_screen 
##                 2
include_graphics('PDF_Figure/GO_MF_Dotplot.pdf')

ego_MF <- pairwise_termsim(ego_MF)
pdf("PDF_Figure/GO_MF_eMAPplot.pdf",
    width = 10,
    height = 10)
emapplot(ego_MF, showCategory = 50)
dev.off()
## quartz_off_screen 
##                 2
include_graphics('PDF_Figure/GO_MF_eMAPplot.pdf')

pdf("PDF_Figure/GO_MF_cNetplot.pdf",
    width = 10,
    height = 10)
cnetplot(ego_MF, 
         categorySize="pvalue", 
         showCategory = 5, 
         foldChange=OE, 
         vertex.label.font=6)
dev.off()
## quartz_off_screen 
##                 2
include_graphics('PDF_Figure/GO_MF_cNetplot.pdf')

Cellular Component

pdf("PDF_Figure/GO_CC_Dotplot.pdf",
    width = 10,
    height = 5)
dotplot(ego_CC, showCategory=10)
dev.off()
## quartz_off_screen 
##                 2
include_graphics('PDF_Figure/GO_CC_Dotplot.pdf')

ego_CC <- pairwise_termsim(ego_CC)
pdf("PDF_Figure/GO_CC_eMAPplot.pdf",
    width = 10,
    height = 10)
emapplot(ego_CC, showCategory = 50)
dev.off()
## quartz_off_screen 
##                 2
include_graphics('PDF_Figure/GO_CC_eMAPplot.pdf')

pdf("PDF_Figure/GO_CC_cNetplot.pdf",
    width = 10,
    height = 10)
cnetplot(ego_CC, 
         categorySize="pvalue", 
         showCategory = 5, 
         foldChange=OE, 
         vertex.label.font=6)
dev.off()
## quartz_off_screen 
##                 2
include_graphics('PDF_Figure/GO_CC_cNetplot.pdf')

KEGG

target_gene <- res %>% dplyr::filter(padj < 0.05) %>% dplyr::select(geneId) %>% unlist() %>% as.character()
detected_gene <- res %>% dplyr::select(geneId) %>% unlist() %>% as.character()

kk <- enrichKEGG(gene = target_gene, 
                 universe = detected_gene,
                 organism = 'mmu',
                 keyType = "kegg",
                 pAdjustMethod = "BH", 
                 pvalueCutoff = 0.05)
## Reading KEGG annotation online:
## 
## Reading KEGG annotation online:
kk.result <- as.data.frame(kk)

pdf("PDF_Figure/KEGG_Dotplot.pdf",
    width = 10,
    height = 5)
dotplot(kk, showCategory=10)
dev.off()
## quartz_off_screen 
##                 2
dir.create("Result/KEGG", showWarnings = FALSE)
write.csv(kk.result, "Result/KEGG/KEGG analysis_merged.csv")

Motif Enrichment via HOMER

The BED files are prepared in R. HOMER was run in the terminal. ## Prepare function for HOMER result parsing

plotHomerResults <- function(homer.table, homer.pwms, n.motifs = 20) {
    ncol <- 4
    laymat <- matrix(1:((1 + n.motifs) * ncol), ncol = ncol, byrow = FALSE)
    logos.list <- lapply(homer.pwms[1:n.motifs],
                         function(pwm) {
                             ggseqlogo(pwm) +
                                 theme(axis.text.x = element_blank(),
                                       axis.title.y = element_blank(),
                                       axis.line.y =
                                           element_line(color = "gray"),
                                       axis.ticks.y =
                                           element_line(color = "gray")) +
                                 ylim(0, 2)
                         })
    ranks.text <- sapply(homer.table$Rank[1:n.motifs], textGrob,
                         simplify = FALSE)
    pval.text <- sapply(sprintf("%.0f", -homer.table$log10.p)[1:n.motifs],
                        textGrob, simplify = FALSE)
    targ.text <- sapply(homer.table$freq.targets[1:n.motifs], textGrob,
                        simplify = FALSE)
    bg.text <- sapply(homer.table$freq.bg[1:n.motifs], textGrob,
                      simplify = FALSE)
    tf.text <- sapply(homer.table$best.match.simple[1:n.motifs], textGrob,
                      simplify = FALSE)
    headers <- sapply(c("rank", "motif", "-log10\np-value",
                        "freq.\ntargets", "freq.\nbackgr.",
                        "best match"),
                      textGrob,
                      simplify = FALSE)
    all.plots <- c(headers[1], ranks.text,
                   headers[2], logos.list,
                   headers[3], pval.text,
                   # headers[4], targ.text,
                   # headers[5], bg.text,
                   headers[6], tf.text)
    grid.arrange(grobs = all.plots, layout_matrix = laymat,
                 # widths = c(1, 4, 1, 1, 1, 3),
                 widths = c(1, 4, 1, 3),
                 ncol = ncol)
}

loadPWM <- function(filename) {
    motif <- fread(filename, skip = 1)
    if (nrow(motif) > 0) {
        pwm <- t(as.matrix(as.data.frame(motif)))
        rownames(pwm) <- c("A", "C", "G", "T")
        pwm
    } else {
        NULL
    }
}

loadHomerResults <- function(dirname) {
    # also allow version="extended" for motifs without stringent
    #   similarity filtering
    homer.table <-
        html_nodes(read_html(sprintf("%s/homerResults.html", dirname)), "table")
    homer.table <- html_table(homer.table, header = TRUE)[[1]]
    homer.table <- data.table(homer.table, check.names = TRUE)
    homer.table <- homer.table[, .(Rank,
                                   log10.p = log.P.pvalue / log(10),
                                   freq.targets = X..of.Targets,
                                   freq.bg = X..of.Background,
                                   best.match = Best.Match.Details)]
    homer.table[, filename := sprintf("%s/homerResults/motif%s.motif",
                                      dirname, seq_along(Rank))]
    homer.table[, best.match.simple := sapply(strsplit(homer.table$best.match, "/"), "[", 1)]
    homer.pwms <- sapply(homer.table$filename, loadPWM, simplify = FALSE)
    list(homer.table, homer.pwms)
}

Up-regulated peaks in KRasG12D pancreas

Actual HOMER run was done on O2

res.up <- res %>% dplyr::filter(padj < CUTOFF & log2FoldChange > 0) %>% GRanges()
dir.create("Result/Kmer", showWarnings = FALSE)
rtracklayer::export(res.up, "Result/Kmer/KRas_up_ATAC_peaks_merged.bed")

genomic.regions <- "Result/Kmer/KRas_up_ATAC_peaks_merged.bed"
homerdir <- "Result/Kmer/homer-denovo-output-KRas_up_peaks"
dir.create(homerdir, showWarnings = FALSE)

homer.cmd <- sprintf("findMotifsGenome.pl %s mm10 %s -len 8 -size given -p 10 -bits -cache 1000 >.homer-output 2>.err.homer-output", genomic.regions, homerdir)

print(homer.cmd)
## [1] "findMotifsGenome.pl Result/Kmer/KRas_up_ATAC_peaks_merged.bed mm10 Result/Kmer/homer-denovo-output-KRas_up_peaks -len 8 -size given -p 10 -bits -cache 1000 >.homer-output 2>.err.homer-output"
up.peaks.homer.res <- loadHomerResults(homerdir)
print(up.peaks.homer.res[[1]]$best.match.simple)
## [1] "HOXC13"                  "SD0001.1_at_AC_acceptor"
## [3] "MYB"                     "RUNX-AML(Runt)"         
## [5] "POL009.1_DCE_S_II"       "ZBTB32"                 
## [7] "PRDM4"
plotHomerResults(up.peaks.homer.res[[1]],
                 up.peaks.homer.res[[2]], n.motifs = 6)
## Warning: `guides(<scale> = FALSE)` is deprecated. Please use `guides(<scale> =
## "none")` instead.

## Warning: `guides(<scale> = FALSE)` is deprecated. Please use `guides(<scale> =
## "none")` instead.

## Warning: `guides(<scale> = FALSE)` is deprecated. Please use `guides(<scale> =
## "none")` instead.

## Warning: `guides(<scale> = FALSE)` is deprecated. Please use `guides(<scale> =
## "none")` instead.

## Warning: `guides(<scale> = FALSE)` is deprecated. Please use `guides(<scale> =
## "none")` instead.

## Warning: `guides(<scale> = FALSE)` is deprecated. Please use `guides(<scale> =
## "none")` instead.

Down-regulated peaks in KRasG12D pancreas

Actual HOMER run was done on O2

res.down <- res %>% dplyr::filter(padj < CUTOFF & log2FoldChange < 0) %>% GRanges()
dir.create("Result/Kmer", showWarnings = FALSE)
rtracklayer::export(res.up, "Result/Kmer/KRas_down_ATAC_peaks_merged.bed")

genomic.regions <- "Result/Kmer/KRas_down_ATAC_peaks_merged.bed"
homerdir <- "Result/Kmer/homer-denovo-output-KRas_down_peaks"
dir.create(homerdir, showWarnings = FALSE)

homer.cmd <- sprintf("findMotifsGenome.pl %s mm10 %s -len 8 -size given -p 10 -bits -cache 1000 >.homer-output 2>.err.homer-output", genomic.regions, homerdir)

print(homer.cmd)
## [1] "findMotifsGenome.pl Result/Kmer/KRas_down_ATAC_peaks_merged.bed mm10 Result/Kmer/homer-denovo-output-KRas_down_peaks -len 8 -size given -p 10 -bits -cache 1000 >.homer-output 2>.err.homer-output"
down.peaks.homer.res <- loadHomerResults(homerdir)
print(down.peaks.homer.res[[1]]$best.match.simple)
## [1] "HOXC13"                  "SD0001.1_at_AC_acceptor"
## [3] "SOX15"                   "POL009.1_DCE_S_II"      
## [5] "ZBTB32"                  "PB0208.1_Zscan4_2"
plotHomerResults(down.peaks.homer.res[[1]],
                 down.peaks.homer.res[[2]], n.motifs = 6)
## Warning: `guides(<scale> = FALSE)` is deprecated. Please use `guides(<scale> =
## "none")` instead.

## Warning: `guides(<scale> = FALSE)` is deprecated. Please use `guides(<scale> =
## "none")` instead.

## Warning: `guides(<scale> = FALSE)` is deprecated. Please use `guides(<scale> =
## "none")` instead.

## Warning: `guides(<scale> = FALSE)` is deprecated. Please use `guides(<scale> =
## "none")` instead.

## Warning: `guides(<scale> = FALSE)` is deprecated. Please use `guides(<scale> =
## "none")` instead.

## Warning: `guides(<scale> = FALSE)` is deprecated. Please use `guides(<scale> =
## "none")` instead.

Session Info

sessionInfo()
## R version 4.1.1 (2021-08-10)
## Platform: x86_64-apple-darwin17.0 (64-bit)
## Running under: macOS Catalina 10.15.4
## 
## Matrix products: default
## BLAS:   /Library/Frameworks/R.framework/Versions/4.1/Resources/lib/libRblas.0.dylib
## LAPACK: /Library/Frameworks/R.framework/Versions/4.1/Resources/lib/libRlapack.dylib
## 
## locale:
## [1] en_US.UTF-8/en_US.UTF-8/en_US.UTF-8/C/en_US.UTF-8/en_US.UTF-8
## 
## attached base packages:
##  [1] grid      parallel  stats4    stats     graphics  grDevices utils    
##  [8] datasets  methods   base     
## 
## other attached packages:
##  [1] enrichplot_1.12.2                        
##  [2] data.table_1.14.2                        
##  [3] Rsamtools_2.8.0                          
##  [4] Biostrings_2.60.2                        
##  [5] XVector_0.32.0                           
##  [6] rtracklayer_1.52.1                       
##  [7] gridExtra_2.3                            
##  [8] ggseqlogo_0.1                            
##  [9] rvest_1.0.1                              
## [10] fgsea_1.18.0                             
## [11] pathview_1.32.0                          
## [12] org.Mm.eg.db_3.13.0                      
## [13] DOSE_3.18.3                              
## [14] clusterProfiler_4.0.5                    
## [15] mosaic_1.8.3                             
## [16] ggridges_0.5.3                           
## [17] mosaicData_0.20.2                        
## [18] ggformula_0.10.1                         
## [19] ggstance_0.3.5                           
## [20] Matrix_1.3-4                             
## [21] mixOmics_6.16.3                          
## [22] lattice_0.20-45                          
## [23] MASS_7.3-54                              
## [24] TxDb.Mmusculus.UCSC.mm10.knownGene_3.10.0
## [25] GenomicFeatures_1.44.2                   
## [26] AnnotationDbi_1.54.1                     
## [27] ChIPseeker_1.28.3                        
## [28] biomaRt_2.48.3                           
## [29] gplots_3.1.1                             
## [30] RColorBrewer_1.1-2                       
## [31] ggrepel_0.9.1                            
## [32] biobroom_1.24.0                          
## [33] broom_0.7.9                              
## [34] plyranges_1.12.1                         
## [35] kableExtra_1.3.4                         
## [36] knitr_1.36                               
## [37] DESeq2_1.32.0                            
## [38] SummarizedExperiment_1.22.0              
## [39] Biobase_2.52.0                           
## [40] MatrixGenerics_1.4.3                     
## [41] matrixStats_0.61.0                       
## [42] GenomicRanges_1.44.0                     
## [43] GenomeInfoDb_1.28.4                      
## [44] IRanges_2.26.0                           
## [45] S4Vectors_0.30.2                         
## [46] BiocGenerics_0.38.0                      
## [47] forcats_0.5.1                            
## [48] stringr_1.4.0                            
## [49] dplyr_1.0.7                              
## [50] purrr_0.3.4                              
## [51] readr_2.0.2                              
## [52] tidyr_1.1.4                              
## [53] tibble_3.1.5                             
## [54] ggplot2_3.3.5                            
## [55] tidyverse_1.3.1                          
## 
## loaded via a namespace (and not attached):
##   [1] utf8_1.2.2                             
##   [2] tidyselect_1.1.1                       
##   [3] htmlwidgets_1.5.4                      
##   [4] RSQLite_2.2.8                          
##   [5] BiocParallel_1.26.2                    
##   [6] scatterpie_0.1.7                       
##   [7] munsell_0.5.0                          
##   [8] withr_2.4.2                            
##   [9] colorspace_2.0-2                       
##  [10] GOSemSim_2.18.1                        
##  [11] filelock_1.0.2                         
##  [12] highr_0.9                              
##  [13] rstudioapi_0.13                        
##  [14] labeling_0.4.2                         
##  [15] KEGGgraph_1.52.0                       
##  [16] GenomeInfoDbData_1.2.6                 
##  [17] polyclip_1.10-0                        
##  [18] bit64_4.0.5                            
##  [19] farver_2.1.0                           
##  [20] downloader_0.4                         
##  [21] vctrs_0.3.8                            
##  [22] treeio_1.16.2                          
##  [23] generics_0.1.0                         
##  [24] xfun_0.26                              
##  [25] BiocFileCache_2.0.0                    
##  [26] R6_2.5.1                               
##  [27] graphlayouts_0.7.1                     
##  [28] locfit_1.5-9.4                         
##  [29] bitops_1.0-7                           
##  [30] cachem_1.0.6                           
##  [31] gridGraphics_0.5-1                     
##  [32] DelayedArray_0.18.0                    
##  [33] assertthat_0.2.1                       
##  [34] vroom_1.5.5                            
##  [35] BiocIO_1.2.0                           
##  [36] scales_1.1.1                           
##  [37] ggraph_2.0.5                           
##  [38] gtable_0.3.0                           
##  [39] tidygraph_1.2.0                        
##  [40] rlang_0.4.11                           
##  [41] genefilter_1.74.0                      
##  [42] systemfonts_1.0.2                      
##  [43] splines_4.1.1                          
##  [44] lazyeval_0.2.2                         
##  [45] selectr_0.4-2                          
##  [46] mosaicCore_0.9.0                       
##  [47] yaml_2.2.1                             
##  [48] reshape2_1.4.4                         
##  [49] modelr_0.1.8                           
##  [50] crosstalk_1.1.1                        
##  [51] backports_1.2.1                        
##  [52] qvalue_2.24.0                          
##  [53] tools_4.1.1                            
##  [54] ggplotify_0.1.0                        
##  [55] ellipsis_0.3.2                         
##  [56] jquerylib_0.1.4                        
##  [57] ggdendro_0.1.22                        
##  [58] Rcpp_1.0.7                             
##  [59] plyr_1.8.6                             
##  [60] progress_1.2.2                         
##  [61] zlibbioc_1.38.0                        
##  [62] RCurl_1.98-1.5                         
##  [63] prettyunits_1.1.1                      
##  [64] viridis_0.6.1                          
##  [65] cowplot_1.1.1                          
##  [66] haven_2.4.3                            
##  [67] fs_1.5.0                               
##  [68] magrittr_2.0.1                         
##  [69] RSpectra_0.16-0                        
##  [70] DO.db_2.9                              
##  [71] reprex_2.0.1                           
##  [72] ggnewscale_0.4.5                       
##  [73] hms_1.1.1                              
##  [74] patchwork_1.1.1                        
##  [75] evaluate_0.14                          
##  [76] xtable_1.8-4                           
##  [77] leaflet_2.0.4.1                        
##  [78] XML_3.99-0.8                           
##  [79] readxl_1.3.1                           
##  [80] ggupset_0.3.0                          
##  [81] compiler_4.1.1                         
##  [82] ellipse_0.4.2                          
##  [83] shadowtext_0.0.9                       
##  [84] KernSmooth_2.23-20                     
##  [85] crayon_1.4.1                           
##  [86] htmltools_0.5.2                        
##  [87] corpcor_1.6.10                         
##  [88] ggfun_0.0.4                            
##  [89] tzdb_0.1.2                             
##  [90] geneplotter_1.70.0                     
##  [91] aplot_0.1.1                            
##  [92] lubridate_1.7.10                       
##  [93] DBI_1.1.1                              
##  [94] tweenr_1.0.2                           
##  [95] dbplyr_2.1.1                           
##  [96] rappdirs_0.3.3                         
##  [97] boot_1.3-28                            
##  [98] cli_3.0.1                              
##  [99] igraph_1.2.6                           
## [100] pkgconfig_2.0.3                        
## [101] TxDb.Hsapiens.UCSC.hg19.knownGene_3.2.2
## [102] GenomicAlignments_1.28.0               
## [103] xml2_1.3.2                             
## [104] rARPACK_0.11-0                         
## [105] ggtree_3.0.4                           
## [106] svglite_2.0.0                          
## [107] annotate_1.70.0                        
## [108] webshot_0.5.2                          
## [109] yulab.utils_0.0.2                      
## [110] digest_0.6.28                          
## [111] graph_1.70.0                           
## [112] rmarkdown_2.11                         
## [113] cellranger_1.1.0                       
## [114] fastmatch_1.1-3                        
## [115] tidytree_0.3.5                         
## [116] restfulr_0.0.13                        
## [117] curl_4.3.2                             
## [118] gtools_3.9.2                           
## [119] rjson_0.2.20                           
## [120] lifecycle_1.0.1                        
## [121] nlme_3.1-153                           
## [122] jsonlite_1.7.2                         
## [123] viridisLite_0.4.0                      
## [124] fansi_0.5.0                            
## [125] labelled_2.8.0                         
## [126] pillar_1.6.3                           
## [127] plotrix_3.8-2                          
## [128] KEGGREST_1.32.0                        
## [129] fastmap_1.1.0                          
## [130] httr_1.4.2                             
## [131] survival_3.2-13                        
## [132] GO.db_3.13.0                           
## [133] glue_1.4.2                             
## [134] png_0.1-7                              
## [135] Rgraphviz_2.36.0                       
## [136] bit_4.0.4                              
## [137] ggforce_0.3.3                          
## [138] stringi_1.7.5                          
## [139] blob_1.2.2                             
## [140] org.Hs.eg.db_3.13.0                    
## [141] caTools_1.18.2                         
## [142] memoise_2.0.0                          
## [143] ape_5.5